library(ggplot2)
library(tidyverse)
library(lubridate)

Accidents_Original <- readRDS("farsp.RDS")

alcohol <- read.csv("alcohol.csv")
alcohol <- alcohol %>% 
  mutate(description = if_else(description %in% c("Unknown", "Not reported"), NA, description))

body_typ <- read.csv("body_typ.csv")
body_typ <- body_typ %>% 
  mutate(description = if_else(description %in% c("Unknown body type", "Not Reported"), NA, description))

col_names <- read.csv("column_name_description.csv")

inj_sev <- read.csv("inj_sev.csv")
inj_sev <- inj_sev %>% 
  mutate(description = if_else(description %in% c("Unknown/Not Reported", "N/A"), NA, description))

man_coll <- read.csv("man_coll.csv")
man_coll <- man_coll %>% 
  mutate(description = if_else(description %in% c("Not Reported", "Unknown"), NA, description))

sex <- read.csv("sex.csv")
sex <- sex %>% 
  mutate(description = if_else(description %in% c("not reported", "unknown"), NA, description))

state_code <- read.csv("state_code.csv")

remember to remove setwd

col_names <- setNames(col_names$column_name, col_names$description)
names(col_names) <- gsub(" ", "_", names(col_names))

Names_Changed <- Accidents_Original %>%
  rename(alcohol = drinking) %>% 
  left_join(state_code, by = c("state" = "state_code")) %>%
  left_join(sex, by = "sex") %>%
  left_join(man_coll, by = "man_coll") %>%
  left_join(inj_sev, by = "inj_sev") %>%
  left_join(body_typ, by = "body_typ") %>%
  left_join(alcohol, by = "alcohol") %>%
    mutate(across(c(year, month, day, hour, minute, age), ~replace(., . == 99, NA)),
         across(c(age), ~replace(., . == 999, NA))) %>%
  drop_na(c("year", "month", "day", "hour", "minute")) %>% 
  mutate(
    state = as.character(state_name),
    sex = as.character(description.x),
    man_coll = as.character(description.y),
    alcohol = as.character(description),
    inj_sev = as.character(description.x.x),
    body_typ = as.character(description.y.y),
    Datetime = ymd_hm(paste(year, month, day, hour, minute)),
    Month_Name = month(month, label = TRUE)
  ) %>%
  rename(!!!col_names) %>% 
  mutate(
    State = as.factor(State),
    County = as.factor(County),
    Manner_of_collision = as.factor(Manner_of_collision),
    Vehicle_body_type = as.factor(Vehicle_body_type),
    Sex = as.factor(Sex),
    Injury_severity = factor(Injury_severity, levels = c("No Apparent Injury", "Injured, Severity Unknown", "Possible Injury", "Suspected Minor Injury", "Suspected Serious Injury", "Non-incapacitating injury", "Fatal Injury", "Died Prior to Crash")),
    Injury_Severity_Basic = if_else(Injury_severity %in% c("Fatal Injury", "Died Prior to Crash"), "Fatal", "Non-Fatal"),
    Datetime = as.POSIXct(Datetime),
    Date = as.Date(Datetime),
    Accident_weekday = factor(wday(Datetime, label = TRUE), levels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")),
    Year = year(Datetime)
    ) %>%
  filter(Age < 100) %>%
  select(-State, -description.x, -description.y, -description.x.x, -description.y.y,-description)

Graph 1: Number of Accidents per State

ggplot(Names_Changed %>% filter(!is.na(state_name)), aes(x = state_name)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Number of Accidents per State")

Graph 2: Number of Accidents per Month

ggplot(Names_Changed %>% filter(!is.na(Datetime)), aes(x = month(Datetime, label = TRUE))) +
geom_bar() +
ggtitle("Number of Accidents per Month")

Graph 3: Accident Rates Over Time

ggplot(Names_Changed, aes(x = Date, y = after_stat(count))) +
geom_line(stat = "count") +
labs(x = "Date", y = "Accidents", title = "Accident Rates Over Time")

Graph 4: Number of Accidents per Hour

ggplot(Names_Changed %>% filter(!is.na(Accident_hour)), aes(x = Accident_hour)) +
geom_bar() +
ggtitle("Number of Accidents per Hour")

Graph 5: Distribution of Age of Persons Involved in Accidents

ggplot(Names_Changed %>% filter(!is.na(Age)), aes(x = Age)) +
geom_histogram(binwidth = 1) +
ggtitle("Distribution of Age of Persons Involved in Accidents")

Graph 6: Number of Accidents by Sex

ggplot(Names_Changed %>% filter(!is.na(Sex)), aes(x = Sex)) +
geom_bar() +
ggtitle("Number of Accidents by Sex")

Graph 7: Number of Accidents with Alcohol Involvement (Should go into this more by selecting just accidents involving alcohol)

ggplot(Names_Changed %>% filter(!is.na(Alcohol_involvement)), aes(x = Alcohol_involvement)) +
geom_bar() +
ggtitle("Number of Accidents with Alcohol Involvement")

Graph 8: Distribution of Injury Severity

ggplot(Names_Changed %>% filter(!is.na(Injury_severity)), aes(x = Injury_severity)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Distribution of Injury Severity")

Graph 9: Number of Accidents by Manner of Collision

ggplot(Names_Changed %>% filter(!is.na(Manner_of_collision)), aes(x = Manner_of_collision)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Number of Accidents by Manner of Collision")

Graph 10: Number of Accidents by Vehicle Body Type - Somewhat redundant given what is below

accidents_by_body_type_injury_sex <- Names_Changed %>%
  filter(!is.na(Vehicle_body_type), !is.na(Injury_severity)) %>%
  group_by(Vehicle_body_type, Injury_severity, Sex, Alcohol_involvement) %>%
  summarize(n = n()) %>% 
  mutate(Vehicle_body_type = case_when(
    grepl("sedan|coupe|convertible|hatchback|automobile|compact", Vehicle_body_type, ignore.case = TRUE) ~ "Automobiles",
    grepl("motorcycle|scooter|moped", Vehicle_body_type, ignore.case = TRUE) ~ "Motorcycles and Scooters",
    grepl("van|minivan", Vehicle_body_type, ignore.case = TRUE) ~ "Vans and Minivans",
    grepl("pickup|utility|truck", Vehicle_body_type, ignore.case = TRUE) ~ "Trucks",
    grepl("bus", Vehicle_body_type, ignore.case = TRUE) ~ "Buses",
    grepl("ATV|snowmobile|golf cart|off-highway", Vehicle_body_type, ignore.case = TRUE) ~ "Specialized Vehicles",
    TRUE ~ "Unknown/Other Vehicle Types"
  )) %>% 
  filter(n > 4000) %>% 
  mutate(Injury_severity = factor(Injury_severity, levels = c("No Apparent Injury", "Possible Injury", "Suspected Minor Injury", "Suspected Serious Injury", "Fatal Injury", "Injured, Severity Unknown", "Died Prior to Crash"), ordered = TRUE))
## `summarise()` has grouped output by 'Vehicle_body_type', 'Injury_severity',
## 'Sex'. You can override using the `.groups` argument.
ggplot(accidents_by_body_type_injury_sex, aes(x = Vehicle_body_type)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Number of Accidents by Vehicle Body Type")

Graph 11: Age Distribution of Persons Involved in Accidents

ggplot(Names_Changed, aes(x = Age)) +
geom_histogram(binwidth = 1) +
labs(x = "Age", y = "Count", title = "Age Distribution of Persons Involved in Accidents")

Graph 12: Age Distribution by Injury Severity

ggplot(Names_Changed, aes(x = Injury_severity, y = Age)) +
geom_boxplot() +
labs(x = "Injury Severity", y = "Age", title = "Age Distribution by Injury Severity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))

Graph 13: Manner of Collision Distribution

ggplot(Names_Changed, aes(x = Manner_of_collision)) +
geom_bar() +
labs(x = "Manner of Collision", y = "Count", title = "Manner of Collision Distribution") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))

Graph 14: Number of Accidents by Hour of the Day and Month

accidents_by_hour_month <- Names_Changed %>%
  group_by(Accident_hour, Month_Name) %>%
  summarize(Count = n()) %>%
  ungroup()
## `summarise()` has grouped output by 'Accident_hour'. You can override using the
## `.groups` argument.
ggplot(accidents_by_hour_month, aes(x = Accident_hour, y = Count, fill = factor(Month_Name))) +
geom_col(position = "stack") +
labs(title = "Number of Accidents by Hour of the Day and Month") +
scale_fill_discrete(name = "Hour of the Day") +
theme_minimal()

Graph 15: Relationship between Age and Number of Persons Involved in Accidents

ggplot(Names_Changed, aes(x = Age, y = Number_of_persons_involved)) +
geom_point(alpha = 0.3) +
labs(x = "Age", y = "Number of persons involved", title = "Relationship between Age and Number of Persons Involved in Accidents")

Graph 16: Relationship between Age and Number of Persons Involved in Accidents

ggplot(Names_Changed, aes(x = Age, y = Number_of_persons_involved)) +
geom_point(alpha = 0.3) +
labs(x = "Age", y = "Number of persons involved", title = "Relationship between Age and Number of Persons Involved in Accidents")

Graph 17: Density of Accidents by Day of Month and Time of Day

ggplot(Names_Changed, aes(x = Accident_hour, y = Accident_day)) +
geom_bin2d(bins = 20) +
labs(x = "Hour of day", y = "Day of month", fill = "Number of accidents", title = "Density of Accidents by Day of Month and Time of Day")

Graph 18: Total Number of Accidents by Day of the Week

ggplot(Names_Changed, aes(x = Accident_weekday)) +
geom_bar() +
labs(x = "Day of week", y = "Number of accidents", title = "Total Number of Accidents by Day of the Week")

Graph 19: Total Number of Accidents by Year

ggplot(Names_Changed, aes(x = Accident_year)) +
geom_bar() +
labs(x = "Year", y = "Number of accidents", title = "Total Number of Accidents by Year")

Graph 20: Relationship between Age and Time of Day When Accidents Occur - why 24 and 0 different? FIX THIS

# ggplot(Names_Changed, aes(x = Accident_hour, y = Age)) +
# geom_point(alpha = 0.003) +
# labs(x = "Hour of day", y = "Age", title = "Relationship between Age and Time of Day When Accidents Occur")
ggplot(Names_Changed, aes(x = Accident_hour, y = Age)) +
  geom_point(alpha = 0.003) +
  labs(x = "Hour of day", y = "Age", title = "Relationship between Age and Time of Day When Accidents Occur") +
  scale_alpha(trans = "log", range = c(0.005, 0.5))

Graph 21: Number of Accidents by Vehicle Body Type and Injury Severity

ggplot(accidents_by_body_type_injury_sex, aes(x = Vehicle_body_type, fill = Injury_severity)) +
geom_bar(position = "stack") +
scale_fill_brewer(type = "div", palette = "RdYlBu", direction = -1) +
labs(x = "Vehicle body type", y = "Number of accidents", title = "Number of Accidents by Vehicle Body Type and Injury Severity")+
theme(axis.text.x = element_text(angle = 90, hjust = 1))

Graph 22: Distribution of Age by Sex Involved in Accidents

ggplot(Names_Changed, aes(x = Sex, y = Age)) +
geom_boxplot() +
labs(x = "Sex", y = "Age", title = "Distribution of Age by Sex Involved in Accidents")

Graph 23: Number of Accidents by Month and Injury Severity

ggplot(Names_Changed, aes(x = Month_Name, fill = Injury_severity)) +
geom_bar() +
labs(x = "Month", y = "Number of accidents", title = "Number of Accidents by Month and Injury Severity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))

Graph 24: Distribution of Number of Persons Involved in Accidents

ggplot(subset(Names_Changed, Number_of_persons_involved < 10), aes(x = Number_of_persons_involved)) +
geom_density() +
labs(x = "Number of persons involved", y = "Density", title = "Distribution of Number of Persons Involved in Accidents")

Graph 25: Distribution of Age by Manner of Collision

ggplot(Names_Changed, aes(x = Manner_of_collision, y = Age)) +
geom_boxplot() +
labs(x = "Manner of collision", y = "Age", title = "Distribution of Age by Manner of Collision")

Graph 26: Heatmap of Accidents by Hour and Day of the Week

accidents_by_hour_weekday <- Names_Changed %>%
group_by(hour = hour(Datetime), weekday = wday(Datetime, label = TRUE)) %>%
summarize(n = n())
## `summarise()` has grouped output by 'hour'. You can override using the
## `.groups` argument.
ggplot(accidents_by_hour_weekday, aes(x = hour, y = weekday, fill = n)) +
geom_tile() +
scale_fill_gradient(low = "#f7f7f7", high = "#e31a1c") +
labs(x = "Hour of day", y = "Day of week", fill = "Number of accidents", title = "Heatmap of Accidents by Hour and Day of the Week")

Graph 27: Accidents by Vehicle Body Type, Injury Severity, and Sex (double check classifications with someone else)

ggplot(accidents_by_body_type_injury_sex, aes(x = Vehicle_body_type, y = n, fill = Injury_severity))+
geom_col(position = "dodge") +
facet_wrap(~ Sex, scales = "free_y") +
  scale_fill_brewer(type = "div", palette = "RdYlBu", direction = -1) +
labs(x = "Vehicle body type", y = "Number of accidents", fill = "Injury severity", title = "Accidents by Vehicle Body Type, Injury Severity, and Sex")+
theme(axis.text.x = element_text(angle = 90, hjust = 1))

Graph 28: Relationship between Hour of Day and Age of Persons Involved in Accidents, Colored by Injury Severity and Faceted by Sex

ggplot(Names_Changed %>% filter(!is.na(Sex)), aes(x = Accident_hour, y = Age, color = Injury_Severity_Basic)) +
  geom_point(alpha = 0.005) +
  facet_wrap(~ Sex) +
  labs(x = "Hour of day", y = "Age", color = "Injury severity", title = "Relationship between Hour of Day and Age of Persons Involved in Accidents, Colored by Injury Severity and Faceted by Sex")+
  guides(color = guide_legend(override.aes = list(alpha = 1)))

Graph 29: Visualizing the Frequency of Accidents Throughout the Year

accidents_per_day <- Names_Changed %>%
mutate(DoY = yday(Datetime), Month = month(Datetime), Day = day(Datetime), Hour = hour(Datetime)) %>%
group_by(Month, Day, Hour) %>%
summarize(count = n()) %>%
ungroup()
## `summarise()` has grouped output by 'Month', 'Day'. You can override using the
## `.groups` argument.
ggplot(accidents_per_day, aes(x = Day, y = Month, fill = count)) +
geom_tile(color = "white") +
scale_y_continuous(trans = 'reverse', breaks = 1:12, labels = month.abb) +
labs(x = "Day", y = "Month", fill = "Accidents", title = "Visualizing the Frequency of Accidents Throughout the Year") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

Graph 30: Analyzing the Relationship Between Time and Accidents Across Different Months

ggplot(accidents_per_day, aes(x = Day, y = Hour, fill = count - mean(count))) +
geom_tile(width = 0.5) +
scale_y_reverse() +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
facet_wrap(~ month(Month, label = TRUE)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 1)) +
labs(x = "Day", y = "Time", fill = "Accidents", title = "Analyzing the Relationship Between Time and Accidents Across Different Months")

Graph 31: Bivariate relationships - Scatter plot

ggplot(data = Names_Changed, aes(x = Age, y = Number_of_persons_involved)) + 
  geom_point(alpha = 0.05) + 
  theme_minimal()
Scatter plot showing the relationship between the age of individuals involved in accidents and the number of persons involved.

Scatter plot showing the relationship between the age of individuals involved in accidents and the number of persons involved.

Graph 32: Time-based analysis - Seasonal decomposition plot

monthly_accidents <- Names_Changed %>%
  mutate(Month = floor_date(Datetime, "month")) %>%
  count(Month)

ggplot(monthly_accidents, aes(x = Month, y = n)) +
  geom_line() +
  theme_minimal() +
  labs(x = "Year", y = "Number of Accidents")
Seasonal decomposition plot showing the trend of accidents over time, highlighting any seasonality in the data.

Seasonal decomposition plot showing the trend of accidents over time, highlighting any seasonality in the data.

Graph 33: Number of vehicles and people

ggplot(Names_Changed, aes(x= Number_of_vehicles_involved, y = Number_of_persons_involved))+
  geom_point(alpha = 0.1)
Scatter plot showing the relationship between the number of vehicles and the number of persons involved in accidents.

Scatter plot showing the relationship between the number of vehicles and the number of persons involved in accidents.

Graph 34: Density plot of Age

ggplot(data = Names_Changed, aes(x = Age)) +
  geom_density(fill = "lightblue", alpha = 0.6) +
  theme_minimal() +
  labs(x = "Age", y = "Density")
Density plot showing the distribution of age of individuals involved in accidents.

Density plot showing the distribution of age of individuals involved in accidents.

Graph 35: Age vs Number of Persons Involved by Sex

ggplot(data = Names_Changed, aes(x = Age, y = Number_of_persons_involved)) +
  geom_point(alpha = 0.2) +
  facet_wrap(~ Sex) +
  theme_minimal() +
  labs(x = "Age", y = "Number of Persons Involved")
Scatter plot showing the relationship between age and number of persons involved in accidents, separated by sex.

Scatter plot showing the relationship between age and number of persons involved in accidents, separated by sex.

Graph 36: Violin plot of Age by Sex

ggplot(data = Names_Changed, aes(x = Sex, y = Age, fill = Sex)) +
  geom_violin() +
  theme_minimal() +
  labs(x = "Sex", y = "Age")
Violin plot showing the distribution of age of individuals involved in accidents, separated by sex.

Violin plot showing the distribution of age of individuals involved in accidents, separated by sex.

Graph 37: Hexbin plot of Age vs Number of Persons Involved

ggplot(data = Names_Changed, aes(x = Age, y = Number_of_persons_involved)) +
  geom_hex(bins = 30) +
  theme_minimal() +
  labs(x = "Age", y = "Number of Persons Involved")
Hexbin plot showing the density of points in a hexagonal grid, with the x-axis representing age and y-axis representing the number of persons involved in accidents.

Hexbin plot showing the density of points in a hexagonal grid, with the x-axis representing age and y-axis representing the number of persons involved in accidents.

Graph 38: Ridge plot of Age distribution by Injury Severity (Why are these different?)

library(ggridges)
ggplot(data = Names_Changed, aes(x = Age, y = Injury_severity, fill = Injury_severity)) +
  geom_density_ridges() +
  theme_minimal() +
  labs(x = "Age", y = "Injury Severity")
## Picking joint bandwidth of 2.5
Ridge plot showing the distribution of age of individuals involved in accidents, separated by injury severity.

Ridge plot showing the distribution of age of individuals involved in accidents, separated by injury severity.

ggplot(data = Names_Changed, aes(x = Age, y = ..count.., fill = Injury_severity)) +
  stat_density(geom = "line") +
  scale_fill_discrete(name = "Injury Severity") +
  labs(x = "Age", y = "Count of Rows") +
  theme_minimal()+
  facet_wrap(~ Injury_severity, scales = "free")+
  xlim(0, 100)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
Ridge plot showing the distribution of age of individuals involved in accidents, separated by injury severity.

Ridge plot showing the distribution of age of individuals involved in accidents, separated by injury severity.

Graph 39: Bar plot of Number of Accidents by Vehicle Body Type, faceted by Alcohol Involvement (Double check accidents_by_body_type_injury_sex to make sure this is working properly)

ggplot(data = accidents_by_body_type_injury_sex, aes(x = Vehicle_body_type, y = n)) +
  geom_col() +
  facet_wrap(~ Alcohol_involvement) +
  theme_minimal() +
  labs(x = "Vehicle Body Type", y = "Number of Accidents") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Bar plot showing the number of accidents by vehicle body type, faceted by alcohol involvement.

Bar plot showing the number of accidents by vehicle body type, faceted by alcohol involvement.

Graph 40: Scatter plot with jitter for the Number of Persons Involved in Accidents by Hour, colored by Sex

ggplot(data = Names_Changed, aes(x = Accident_hour, y = Number_of_persons_involved, color = Sex)) +
  geom_jitter(alpha = 0.1, width = 0.2, height = 0.2) +
  theme_minimal() +
  labs(x = "Hour of the Day", y = "Number of Persons Involved")
Scatter plot showing the relationship between the hour of the day and the number of persons involved in accidents, colored by sex.

Scatter plot showing the relationship between the hour of the day and the number of persons involved in accidents, colored by sex.

Graph 41: Bubble plot for the number of accidents by state and injury severity - Not sure this is the best way to display this information, maybe have this for regions…

state_injury_severity <- Names_Changed %>% count(state_name, Injury_severity)

ggplot(data = state_injury_severity, aes(x = state_name, y = Injury_severity, size = n, color = Injury_severity)) +
  geom_point(alpha = 0.5) +
  theme_minimal() +
  labs(x = "State", y = "Injury Severity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Bubble plot showing the number of accidents by state and injury severity, with bubble size indicating the number of accidents and color indicating injury severity.

Bubble plot showing the number of accidents by state and injury severity, with bubble size indicating the number of accidents and color indicating injury severity.

Graph 42: Bar plot for the number of accidents by month and year

monthly_accidents <- Names_Changed %>% count(Year = year(Datetime), Month = month(Datetime))

ggplot(data = monthly_accidents, aes(x = month(Month, label = TRUE), y = n, fill = as.factor(Year))) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(x = "Month", y = "Number of Accidents", fill = "Year")
Bar plot showing the number of accidents by month and year.

Bar plot showing the number of accidents by month and year.

Graph 43: Ridgeline plot of the number of accidents by state and hour. RI and DC have some issues with early morning accidents!

ggplot(Names_Changed, aes(x = Accident_hour, y = state_name)) +
    ggridges::geom_density_ridges() +
    labs(title = "Ridgeline Plot: Accident Hour", x = "Hour", y = "State") +
    theme_ridges()
## Picking joint bandwidth of 0.819
Ridgeline plot of number of accidents by state and hour

Ridgeline plot of number of accidents by state and hour

Graph 44: Boxplot of Age by Injury Severity

ggplot(Names_Changed, aes(x = Injury_severity, y = Age, fill = Injury_severity)) +
  geom_boxplot() +
  labs(title = "Boxplot of Age by Injury Severity", x = "Injury Severity", y = "Age") +
  theme_minimal()
Boxplot of Age by Injury Severity

Boxplot of Age by Injury Severity

Graph 45: Normality: Q-Q plot of Age

ggplot(Names_Changed, aes(sample = Age)) +
  geom_qq() +
  geom_qq_line(color = "red") +
  labs(title = "Q-Q Plot of Age", x = "Theoretical Quantiles", y = "Sample Quantiles") +
  theme_minimal()

Graph 46: Linearity: Scatterplot of Age vs. Number_of_persons_involved with a LOESS fit (takes forever to render)

# ggplot(Names_Changed, aes(x = Age, y = Number_of_persons_involved)) +
#   geom_point(alpha = 0.2, aes(color = Sex)) +
#   geom_smooth(method = "loess", se = FALSE, color = "red") +
#   labs(title = "Scatterplot of Age vs. Number of Persons Involved with LOESS fit", x = "Age", y = "Number of Persons Involved") +
#   theme_minimal()

Graph 47: Homoscedasticity: Residual plot (using a linear model as an example) (takes forever to render)

# # Fit a linear model
# model <- lm(Number_of_persons_involved ~ Age, data = Names_Changed)
# 
# # Create a residual plot
# ggplot(data.frame(residuals = resid(model), fitted = fitted(model)), aes(x = fitted, y = residuals)) +
#   geom_point(alpha = 0.2) +
#   geom_smooth(se = FALSE, color = "red", method = "loess") +
#   labs(title = "Residual Plot", x = "Fitted Values", y = "Residuals") +
#   theme_minimal()

Graph 48: Density plot of Age (to check the distribution)

ggplot(Names_Changed, aes(x = Age)) +
  geom_density(fill = "steelblue", alpha = 0.5) +
  labs(title = "Density Plot of Age", x = "Age", y = "Density") +
  theme_minimal()

Next split into states or reigions like New England where hours of daylight will change and may cause more accidents

Names_Changed <- Names_Changed %>%
  mutate(region = ifelse(state_name %in% c("Maine", "New Hampshire", "Vermont", "Massachusetts", "Rhode Island", "Connecticut"), "Northeast",
  ifelse(state_name %in% c("New York", "Pennsylvania", "New Jersey"), "Northeast",
  ifelse(state_name %in% c("Wisconsin", "Michigan", "Illinois", "Indiana", "Ohio"), "Midwest",
  ifelse(state_name %in% c("Minnesota", "Iowa", "Missouri", "North Dakota", "South Dakota", "Nebraska", "Kansas"), "Midwest",
  ifelse(state_name %in% c("Delaware", "Maryland", "District of Columbia", "Virginia", "West Virginia", "North Carolina", "South Carolina", "Georgia", "Florida"), "South",
  ifelse(state_name %in% c("Kentucky", "Tennessee", "Mississippi", "Alabama", "Oklahoma", "Texas", "Arkansas", "Louisiana"), "South",
  ifelse(state_name %in% c("Montana", "Idaho", "Wyoming", "Colorado", "New Mexico", "Arizona", "Utah", "Nevada"), "West",
  ifelse(state_name %in% c("Washington", "Oregon", "California", "Alaska", "Hawaii"), "West", NA_character_)))))))))

Do teens have more accidents on weekends? (Subset data or mutate a column identifying weekends and facet)

What leads to fatal accidents? (Subset data or mutate a column identifying fatal and facet)

Moving window correlation between Number_of_persons_involved and Number_of_vehicles_involved

SingleWindow <- function(statistic, data) {
  if (!is.data.frame(data)) {
    stop("The input data must be a dataframe.")
  }
  switch(statistic,
         "correlation" = list(Value = cor(data[,2], data[,3]), Midpoint = data[ceiling(nrow(data)/2), 1]),
         "p value" = list(Value = cor.test(data[,2], data[,3])$p.value, Midpoint = data[ceiling(nrow(data)/2), 1]),
         "OLS" = list(Value = summary(lm(data[,3] ~ data[,2]))$coefficients, Midpoint = data[ceiling(nrow(data)/2), 1]),
         stop("The input statistic must be one of 'correlation', 'p value', or 'OLS'.")
  )
}


MovingWindow <- function(data, posix_col, col1, col2, window_days, step_size, statistic) {
  # Ensure the input POSIX column is in POSIXct format
  data[[posix_col]] <- as.POSIXct(data[[posix_col]])
  
  # Calculate the minimum and maximum POSIX values in the dataset
  min_date <- min(data[[posix_col]])
  max_date <- max(data[[posix_col]])
  
  # Convert window_days and step_size to seconds
  window_seconds <- window_days * 24 * 60 * 60
  step_seconds <- step_size * 24 * 60 * 60
  
  # Set the starting date for the moving window
  current_date <- min_date
  
  results = list()
  # Iterate through the dataset using the moving window
  while (current_date + window_seconds <= max_date) {
    # Filter the data within the moving window
    window_data <- data[data[[posix_col]] >= current_date & data[[posix_col]] <= current_date + window_seconds, c(posix_col, col1, col2)]
    
    # Select the posix_col, col1, and col2 using dplyr and set it equal to win_val
    win_val <- window_data %>%
      select(all_of(c(posix_col, col1, col2)))
    
    # Print the win_val dataframe for the current window
    # print(colnames(win_val))
    result <- SingleWindow(statistic, win_val)
    results <- rbind(results, data.frame(DateTime = current_date,
                                         Value = result$Value[1],
                                         Midpoint = as.POSIXct(result$Midpoint$Datetime)))
    
    # Move the window by step_size
    current_date <- current_date + step_seconds
    
  }
  
  # print(results)
  ggplot(data = results, aes(x=Midpoint,Value))+
    geom_point()
  
}

MovingWindow(Names_Changed, "Datetime", "Number_of_persons_involved", "Number_of_vehicles_involved", 30, 15, "correlation")